1 /* 2 * Copyright (c) 1999, 2007, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 */ 28 29 /* 30 * Licensed Materials - Property of IBM 31 * 32 * (C) Copyright IBM Corp. 1999 All Rights Reserved. 33 * (C) IBM Corp. 1997-1998. All Rights Reserved. 34 * 35 * The program is provided "as is" without any warranty express or 36 * implied, including the warranty of non-infringement and the implied 37 * warranties of merchantibility and fitness for a particular purpose. 38 * IBM will not be liable for any damages suffered by you as a result 39 * of using the Program. In no event will IBM be liable for any 40 * special, indirect or consequential damages or lost profits even if 41 * IBM has been advised of the possibility of their occurrence. IBM 42 * will not be liable for any third party claims against you. 
/**
 * Default break-iterator rules.  These rules are more or less general for
 * all locales, although there are probably a few we're missing.  The
 * behavior currently mimics the behavior of BreakIterator in JDK 1.2.
 * There are known deficiencies in this behavior, including the fact that
 * the logic for handling CJK characters works for Japanese but not for
 * Chinese, and that we don't currently have an appropriate locale for
 * Thai.  The resources will eventually be updated to fix these problems.
 *
 * <p>Modified for Hindi 3/1/99.
 *
 * <p>Since JDK 1.5.0, this file no longer goes to runtime and is used at
 * J2SE build phase in order to create
 * [Character|Word|Line|Sentence]BreakIteratorData files which are used on
 * runtime instead.
 *
 * <p>The bundle holds four entries, keyed {@code "CharacterBreakRules"},
 * {@code "WordBreakRules"}, {@code "LineBreakRules"} and
 * {@code "SentenceBreakRules"}.  Each value is one long {@code String}
 * assembled from adjacent literals; every statement inside it ends with a
 * semicolon.
 *
 * <p>Maintenance notes: Unicode escapes in the literals below are resolved
 * by the Java compiler, so the rule text contains the actual characters.
 * A doubled backslash in the Java source produces a single backslash in the
 * rule text (presumably quoting the next character for the rule parser --
 * confirm against the rule compiler before changing any of them).
 */
public class BreakIteratorRules extends ListResourceBundle {

    /**
     * Supplies the rule descriptions for the four kinds of break iteration.
     *
     * @return a freshly allocated table whose rows are
     *         {@code { key, ruleDescription }} pairs, both of type
     *         {@code String}, in the order character, word, line, sentence
     */
    @Override
    protected final Object[][] getContents() {
        return new Object[][] {
            // rules describing how to break between logical characters
            { "CharacterBreakRules",

              // ignore non-spacing marks and enclosing marks (since we never
              // put a break before ignore characters, this keeps combining
              // accents with the base characters they modify)
              "<enclosing>=[:Mn::Me:];"

              // other category definitions
              + "<choseong>=[\u1100-\u115f];"
              + "<jungseong>=[\u1160-\u11a7];"
              + "<jongseong>=[\u11a8-\u11ff];"
              + "<surr-hi>=[\ud800-\udbff];"
              + "<surr-lo>=[\udc00-\udfff];"

              // break after every character, except as follows:
              + ".;"

              // keep base and combining characters together
              + "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
              + "<base><enclosing><enclosing>*;"

              // keep CRLF sequences together
              + "\r\n;"

              // keep surrogate pairs together
              + "<surr-hi><surr-lo>;"

              // keep Hangul syllables spelled out using conjoining jamo together
              + "<choseong>*<jungseong>*<jongseong>*;"

              // various additions for Hindi support
              + "<nukta>=[\u093c];"
              + "<danda>=[\u0964\u0965];"
              + "<virama>=[\u094d];"
              + "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
              + "<devConsonant>=[\u0915-\u0939];"
              + "<devNuktaConsonant>=[\u0958-\u095f];"
              + "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
              + "<devCAMN>=(<devConsonant>{<nukta>});"
              + "<devConsonant1>=(<devNuktaConsonant>|<devCAMN>);"
              + "<zwj>=[\u200d];"
              + "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
              + "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
              + "<danda><nukta>;"
            },

            // default rules for finding word boundaries
            { "WordBreakRules",
              // ignore format characters, which should not influence the
              // algorithm.  An earlier definition (kept on the next line for
              // reference) also ignored non-spacing and enclosing marks; those
              // are now attached to their base characters through <enclosing>.
              //"<ignore>=[:Mn::Me::Cf:];"
              "<ignore>=[:Cf:];"

              + "<enclosing>=[:Mn::Me:];"

              // Hindi phrase separator, kanji, katakana, hiragana, CJK diacriticals,
              // other letters, and digits
              + "<danda>=[\u0964\u0965];"
              + "<kanji>=[\u3005\u4e00-\u9fa5\uf900-\ufa2d];"
              + "<kata>=[\u30a1-\u30fa\u30fd\u30fe];"
              + "<hira>=[\u3041-\u3094\u309d\u309e];"
              + "<cjk-diacrit>=[\u3099-\u309c\u30fb\u30fc];"
              + "<letter-base>=[:L::Mc:^[<kanji><kata><hira><cjk-diacrit>]];"
              + "<let>=(<letter-base><enclosing>*);"
              + "<digit-base>=[:N:];"
              + "<dgt>=(<digit-base><enclosing>*);"

              // punctuation that can occur in the middle of a word: currently
              // dashes, apostrophes, quotation marks, and periods
              + "<mid-word>=[:Pd::Pc:\u00ad\u2027\\\"\\\'\\.];"

              // punctuation that can occur in the middle of a number: currently
              // apostrophes, quotation marks, periods, commas, and the Arabic
              // decimal point
              + "<mid-num>=[\\\"\\\'\\,\u066b\\.];"

              // punctuation that can occur at the beginning of a number: currently
              // the period, the number sign, and all currency symbols except the cents sign
              + "<pre-num>=[:Sc:\\#\\.^\u00a2];"

              // punctuation that can occur at the end of a number: currently
              // the percent, per-thousand, per-ten-thousand, and Arabic percent
              // signs, the cents sign, and the ampersand
              + "<post-num>=[\\%\\&\u00a2\u066a\u2030\u2031];"

              // line separators: currently LF, FF, PS, and LS
              + "<ls>=[\n\u000c\u2028\u2029];"

              // whitespace: all space separators and the tab character
              + "<ws-base>=[:Zs:\t];"
              + "<ws>=(<ws-base><enclosing>*);"

              // a word is a sequence of letters that may contain internal
              // punctuation, as long as it begins and ends with a letter and
              // never contains two punctuation marks in a row
              + "<word>=((<let><let>*(<mid-word><let><let>*)*){<danda>});"

              // a number is a sequence of digits that may contain internal
              // punctuation, as long as it begins and ends with a digit and
              // never contains two punctuation marks in a row.
              + "<number>=(<dgt><dgt>*(<mid-num><dgt><dgt>*)*);"

              // break after every character, with the following exceptions
              // (this will cause punctuation marks that aren't considered
              // part of words or numbers to be treated as words unto themselves)
              + ".;"

              // keep together any sequence of contiguous words and numbers
              // (including just one of either), plus an optional trailing
              // number-suffix character
              + "{<word>}(<number><word>)*{<number>{<post-num>}};"

              // keep together any sequence of contiguous words and numbers
              // that starts with a number-prefix character and a number,
              // and may end with a number-suffix character
              + "<pre-num>(<number><word>)*{<number>{<post-num>}};"

              // keep together runs of whitespace (optionally with a single trailing
              // line separator or CRLF sequence)
              + "<ws>*{\r}{<ls>};"

              // keep together runs of Katakana and CJK diacritical marks
              + "[<kata><cjk-diacrit>]*;"

              // keep together runs of Hiragana and CJK diacritical marks
              + "[<hira><cjk-diacrit>]*;"

              // keep together runs of Kanji
              + "<kanji>*;"

              // keep together anything else and an enclosing mark
              + "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
              + "<base><enclosing><enclosing>*;"
            },

            // default rules for determining legal line-breaking positions
            { "LineBreakRules",
              // characters that always cause a break: ETX, tab, LF, FF, LS, and PS
              "<break>=[\u0003\t\n\f\u2028\u2029];"

              // ignore format characters and control characters EXCEPT for breaking chars
              + "<ignore>=[:Cf:[:Cc:^[<break>\r]]];"

              // enclosing marks
              + "<enclosing>=[:Mn::Me:];"

              // Hindi phrase separators
              + "<danda>=[\u0964\u0965];"

              // characters that always prevent a break: the non-breaking space
              // and similar characters
              + "<glue>=[\u00a0\u0f0c\u2007\u2011\u202f\ufeff];"

              // whitespace: space separators and control characters, except for
              // CR and the other characters mentioned above
              + "<space>=[:Zs::Cc:^[<glue><break>\r]];"

              // dashes: dash punctuation and the discretionary hyphen, except for
              // non-breaking hyphens
              + "<dash>=[:Pd:\u00ad^<glue>];"

              // characters that stick to a word if they precede it: currency symbols
              // (except the cents sign) and starting punctuation
              + "<pre-word>=[:Sc::Ps::Pi:^[\u00a2]\\\"\\\'];"

              // characters that stick to a word if they follow it: ending punctuation,
              // other punctuation that usually occurs at the end of a sentence,
              // small Kana characters, some CJK diacritics, etc.
              + "<post-word>=[\\\":Pe::Pf:\\!\\%\\.\\,\\:\\;\\?\u00a2\u00b0\u066a\u2030-\u2034\u2103"
              + "\u2105\u2109\u3001\u3002\u3005\u3041\u3043\u3045\u3047\u3049\u3063"
              + "\u3083\u3085\u3087\u308e\u3099-\u309e\u30a1\u30a3\u30a5\u30a7\u30a9"
              + "\u30c3\u30e3\u30e5\u30e7\u30ee\u30f5\u30f6\u30fc-\u30fe\uff01\uff05"
              + "\uff0c\uff0e\uff1a\uff1b\uff1f];"

              // Kanji: actually includes Kanji, Kana and Hangul syllables,
              // except for small Kana and CJK diacritics
              + "<kanji>=[\u4e00-\u9fa5\uac00-\ud7a3\uf900-\ufa2d\ufa30-\ufa6a\u3041-\u3094\u30a1-\u30fa^[<post-word><ignore>]];"

              // digits
              + "<digit>=[:Nd::No:];"

              // punctuation that can occur in the middle of a number: periods and commas
              + "<mid-num>=[\\.\\,];"

              // everything not mentioned above
              + "<char>=[^[<break><space><dash><kanji><glue><ignore><pre-word><post-word><mid-num>\r<danda>]];"

              // a "number" is a run of prefix characters and dashes, followed by one or
              // more digits with isolated number-punctuation characters interspersed
              + "<number>=([<pre-word><dash>]*<digit><digit>*(<mid-num><digit><digit>*)*);"

              // the basic core of a word can be either a "number" as defined above, a single
              // "Kanji" character, or a run of any number of not-explicitly-mentioned
              // characters (this includes Latin letters)
              + "<word-core>=(<char>*|<kanji>|<number>);"

              // a word may end with an optional suffix that may be either a run of one or
              // more dashes or a run of word-suffix characters
              + "<word-suffix>=((<dash><dash>*|<post-word>*));"

              // a word, thus, is an optional run of word-prefix characters, followed by
              // a word core and a word suffix (the syntax of <word-core> and <word-suffix>
              // actually allows either of them to match the empty string, putting a break
              // between things like ")(" or "aaa(aaa")
              + "<word>=(<pre-word>*<word-core><word-suffix>);"

              + "<hack1>=[\\(];"
              + "<hack2>=[\\)];"
              + "<hack3>=[\\$\\'];"

              // finally, the rule that does the work: Keep together any run of words that
              // are joined by runs of one or more non-spacing mark.  Also keep a trailing
              // line-break character or CRLF combination with the word.  (line separators
              // "win" over nbsp's)
              + "<word>(((<space>*<glue><glue>*{<space>})|<hack3>)<word>)*<space>*{<enclosing>*}{<hack1><hack2><post-word>*}{<enclosing>*}{\r}{<break>};"
              + "\r<break>;"
            },

            // default rules for finding sentence boundaries
            { "SentenceBreakRules",
              // ignore non-spacing marks, enclosing marks, and format characters
              "<ignore>=[:Mn::Me::Cf:];"

              // letters
              + "<letter>=[:L:];"

              // lowercase letters
              + "<lc>=[:Ll:];"

              // uppercase letters
              + "<uc>=[:Lu:];"

              // NOT lowercase letters
              + "<notlc>=[<letter>^<lc>];"

              // whitespace (line separators are treated as whitespace)
              + "<space>=[\t\r\f\n\u2028:Zs:];"

              // punctuation which may occur at the beginning of a sentence: "starting
              // punctuation" and quotation marks
              + "<start-punctuation>=[:Ps::Pi:\\\"\\\'];"

              // punctuation which may occur at the end of a sentence: "ending punctuation"
              // and quotation marks
              + "<end>=[:Pe::Pf:\\\"\\\'];"

              // digits
              + "<digit>=[:N:];"

              // characters that unambiguously signal the end of a sentence
              + "<term>=[\\!\\?\u3002\uff01\uff1f];"

              // periods, which MAY signal the end of a sentence
              + "<period>=[\\.\uff0e];"

              // characters that may occur at the beginning of a sentence: basically anything
              // not mentioned above (letters and digits are specifically excluded)
              + "<sent-start>=[^[:L:<space><start-punctuation><end><digit><term><period>\u2029<ignore>]];"

              // Hindi phrase separator
              + "<danda>=[\u0964\u0965];"

              // always break sentences after paragraph separators
              + ".*?{\u2029};"

              // always break after a danda, if it's followed by whitespace
              + ".*?<danda><space>*;"

              // if you see a period, skip over additional periods and ending punctuation
              // and if the next character is a paragraph separator, break after the
              // paragraph separator (both variants of this rule are disabled)
              //+ ".*?<period>[<period><end>]*<space>*\u2029;"
              //+ ".*?[<period><end>]*<space>*\u2029;"

              // if you see a period, skip over additional periods and ending punctuation,
              // followed by optional whitespace, followed by optional starting punctuation,
              // and if the next character is something that can start a sentence
              // (basically, a capital letter), then put the sentence break between the
              // whitespace and the opening punctuation
              + ".*?<period>[<period><end>]*<space><space>*/<notlc>;"
              + ".*?<period>[<period><end>]*<space>*/[<start-punctuation><sent-start>][<start-punctuation><sent-start>]*<letter>;"

              // if you see a sentence-terminating character, skip over any additional
              // terminators, periods, or ending punctuation, followed by any whitespace,
              // followed by a SINGLE optional paragraph separator, and put the break there
              + ".*?<term>[<term><period><end>]*<space>*{\u2029};"

              // The following rules are here to aid in backwards iteration.  The automatically
              // generated backwards state table will rewind to the beginning of the
              // paragraph all the time (or all the way to the beginning of the document
              // if the document doesn't use the Unicode PS character) because the only
              // unambiguous character pairs are those involving paragraph separators.
              // These specify a few more unambiguous breaking situations.

              // if you see a sentence-starting character, followed by starting punctuation
              // (remember, we're iterating backwards), followed by an optional run of
              // whitespace, followed by an optional run of ending punctuation, followed
              // by a period, this is a safe place to turn around
              + "!<sent-start><start-punctuation>*<space>*<end>*<period>;"

              // if you see a letter or a digit, followed by an optional run of
              // starting punctuation, followed by an optional run of whitespace,
              // followed by an optional run of ending punctuation, followed by
              // a sentence terminator, this is a safe place to turn around
              + "![<sent-start><lc><digit>]<start-punctuation>*<space>*<end>*<term>;"
            }
        };
    }
}